# Rule group: alerts about whether Deckhouse is scraped, present, ready,
# running and still ticking.
- name: d8.deckhouse.availability
  rules:
  # Fires once the `up` series of the job="deckhouse", scrape_source="self"
  # target has been 0 for two minutes.
  - alert: D8DeckhouseSelfTargetDown
    for: 2m
    expr: 'max by (job) (up{job="deckhouse", scrape_source="self"} == 0)'
    annotations:
      summary: Prometheus is unable to scrape Deckhouse metrics.
      d8_ignore_on_update: 'true'
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_ignore_labels: job
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires once the `up` series of the job="deckhouse", scrape_source="custom"
  # target has been 0 for ten minutes.
  - alert: D8DeckhouseCustomTargetDown
    for: 10m
    expr: 'max by (job) (up{job="deckhouse", scrape_source="custom"} == 0)'
    annotations:
      summary: Prometheus is unable to scrape custom metrics generated by Deckhouse hooks.
      d8_ignore_on_update: 'true'
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_ignore_labels: job
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when no `up` series with job="deckhouse", scrape_source="self" has
  # existed at all for two minutes (target missing rather than failing).
  - alert: D8DeckhouseSelfTargetAbsent
    for: 2m
    expr: 'absent(up{job="deckhouse", scrape_source="self"}) == 1'
    annotations:
      summary: There is no Deckhouse target in Prometheus.
      d8_ignore_on_update: 'true'
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when, for ten minutes, the condition="true" readiness gauge of a pod
  # owned by the `deckhouse` Deployment in d8-system is anything other than 1.
  - alert: D8DeckhousePodIsNotReady
    for: 10m
    expr: |
      min by (pod) (
        kube_controller_pod{namespace="d8-system", controller_type="Deployment", controller_name="deckhouse"}
        * on (pod) group_right() kube_pod_status_ready{condition="true", namespace="d8-system"}
      ) != 1
    annotations:
      summary: The Deckhouse Pod is NOT Ready.
      d8_ignore_on_update: 'true'
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: pod
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when, for two minutes, no pod owned by the `deckhouse` Deployment in
  # d8-system reports phase="Running" with value 1.
  # kube_pod_status_phase is exported once per phase with a 0/1 value, so the
  # `== 1` filter is required: without it the phase="Running" series still
  # exists (with value 0) for a Pending or Failed pod and absent() never fires.
  - alert: D8DeckhousePodIsNotRunning
    expr: |
      absent(
        kube_controller_pod{namespace="d8-system", controller_type="Deployment", controller_name="deckhouse"}
        * on (pod) group_right() (kube_pod_status_phase{namespace="d8-system",phase="Running"} == 1)
      )
    for: 2m
    labels:
      severity_level: "4"
      tier: cluster
      d8_module: deckhouse
      d8_component: deckhouse
    annotations:
      d8_ignore_on_update: "true"
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: "D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes"
      plk_grouped_by__d8_deckhouse_malfunctioning: "D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes"
      summary: The Deckhouse Pod is NOT Running.

  # Fires immediately (no `for`) when deckhouse_live_ticks grew by less than 1
  # over the __SCRAPE_INTERVAL_X_4__ window.
  - alert: D8DeckhouseIsHung
    expr: 'max without (container, job) (increase(deckhouse_live_ticks[__SCRAPE_INTERVAL_X_4__])) < 1'
    annotations:
      summary: Deckhouse is down.
      description: |
        Deckhouse is probably down since the `deckhouse_live_ticks` metric in Prometheus is no longer increasing (it is supposed to increment every 10 seconds).
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

# Rule group: alerts about Deckhouse misbehaviour (restarts, registry access,
# task queues, hooks, modules, configuration, updates, watch errors).
- name: d8.deckhouse.malfunctioning
  rules:
  # Fires immediately when a container of a `deckhouse` Deployment pod in
  # d8-system restarted more than 3 times within the last hour.
  # The `and` operand only keeps series for which the raw restart counter is
  # currently present as well.
  - alert: D8DeckhousePodIsRestartingTooOften
    expr: |
      max by (pod) (
        kube_controller_pod{namespace="d8-system", controller_type="Deployment", controller_name="deckhouse"}
        * on (pod) group_right() increase(kube_pod_container_status_restarts_total{namespace="d8-system"}[1h])
        and
        kube_controller_pod{namespace="d8-system", controller_type="Deployment", controller_name="deckhouse"}
        * on (pod) group_right() kube_pod_container_status_restarts_total{namespace="d8-system"}
      ) > 3
    annotations:
      summary: Excessive Deckhouse restarts detected.
      description: |
        The number of restarts in the last hour: {{ $value }}.

        Excessive Deckhouse restarts indicate that something is wrong. Normally, Deckhouse should be up and running all the time.

        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: pod
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '9'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when deckhouse_registry_errors has kept increasing (per pod and
  # instance) over the __SCRAPE_INTERVAL_X_4__ window for a whole hour.
  - alert: D8DeckhouseHasNoAccessToRegistry
    for: 1h
    expr: 'max by (pod, instance) (increase(deckhouse_registry_errors[__SCRAPE_INTERVAL_X_4__])) > 0'
    annotations:
      summary: Deckhouse is unable to connect to the registry.
      description: |
        Deckhouse is unable to connect to the registry (registry.deckhouse.io in most cases) to check for a new Docker image (checks are performed every 15 seconds). Deckhouse does not have access to the registry; automatic updates are not available.

        Usually, this alert means that the Deckhouse Pod is having difficulties with connecting to the Internet.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '7'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a task queue never dropped to zero length within the
  # __SCRAPE_INTERVAL_X_3__ window, sustained for ten minutes.
  # Queues matching the `queue!~` regex are excluded from the check.
  - alert: D8DeckhouseQueueIsHung
    for: 10m
    expr: 'max by (pod, instance, queue) (min_over_time(deckhouse_tasks_queue_length{queue!~"main-subqueue-kubernetes-.*|/modules/upmeter/update_selector.*|/modules/secret-copier|/modules/deckhouse/update_deckhouse_image"}[__SCRAPE_INTERVAL_X_3__])) != 0'
    annotations:
      summary: The {{ $labels.queue }} Deckhouse queue has hung; there are {{ $value }} task(s) in the queue.
      description: |
        Deckhouse cannot finish processing of the {{ $labels.queue }} queue with {{ $value }} tasks piled up.

        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '7'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a global hook's error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for ten minutes.
  # NOTE(review): `or` keeps the left-hand sample whenever both sides carry the
  # same label set, so the allowed-errors counter only contributes for series
  # with no matching errors_total series - confirm this is intended.
  - alert: D8DeckhouseGlobalHookFailsTooOften
    expr: |
      max by (pod, instance, hook) (
        increase(deckhouse_global_hook_errors_total{job="deckhouse"}[__SCRAPE_INTERVAL_X_4__])
        or
        increase(deckhouse_global_hook_allowed_errors_total{job="deckhouse"}[__SCRAPE_INTERVAL_X_4__])
      ) > 1
    for: 10m
    annotations:
      summary: The {{ $labels.hook }} Deckhouse global hook crashes way too often.
      description: |
        The {{ $labels.hook }} has failed in the last `__SCRAPE_INTERVAL_X_4__`.

        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '9'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a module hook's error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for ten minutes.
  # NOTE(review): `or` keeps the left-hand sample whenever both sides carry the
  # same label set, so the allowed-errors counter only contributes for series
  # with no matching errors_total series - confirm this is intended.
  - alert: D8DeckhouseModuleHookFailsTooOften
    expr: |
      max by (pod, instance, module, hook) (
        increase(deckhouse_module_hook_errors_total{job="deckhouse"}[__SCRAPE_INTERVAL_X_4__])
        or
        increase(deckhouse_module_hook_allowed_errors_total{job="deckhouse"}[__SCRAPE_INTERVAL_X_4__])
      ) > 1
    for: 10m
    annotations:
      summary: The {{ $labels.module }}/{{ $labels.hook }} Deckhouse hook crashes way too often.
      description: |
        The {{ $labels.hook }} hook of the {{ $labels.module }} module has failed in the last `__SCRAPE_INTERVAL_X_4__`.

        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '9'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when deckhouse_modules_discover_errors_total grew by more than 1
  # within the __SCRAPE_INTERVAL_X_4__ window, sustained for three minutes.
  - alert: D8DeckhouseCouldNotDiscoverModules
    for: 3m
    expr: 'max by (pod, instance) (increase(deckhouse_modules_discover_errors_total[__SCRAPE_INTERVAL_X_4__])) > 1'
    annotations:
      summary: Deckhouse is unable to discover modules.
      description: |
        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a module's run-error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for three minutes.
  - alert: D8DeckhouseCouldNotRunModule
    for: 3m
    expr: 'max(increase(deckhouse_module_run_errors_total[__SCRAPE_INTERVAL_X_4__])) by (pod, instance, module) > 1'
    annotations:
      summary: Deckhouse is unable to start the {{ $labels.module }} module.
      description: |
        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a module's delete-error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for three minutes.
  - alert: D8DeckhouseCouldNotDeleteModule
    for: 3m
    expr: 'max(increase(deckhouse_module_delete_errors_total[__SCRAPE_INTERVAL_X_4__])) by (pod, instance, module) > 1'
    annotations:
      summary: Deckhouse is unable to delete the {{ $labels.module }} module.
      description: |
        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '4'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a global hook's error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for three minutes.
  - alert: D8DeckhouseCouldNotRunGlobalHook
    for: 3m
    expr: 'max(increase(deckhouse_global_hook_errors_total[__SCRAPE_INTERVAL_X_4__])) by (pod, instance, hook) > 1'
    annotations:
      summary: Deckhouse is unable to run the {{ $labels.hook }} global hook.
      description: |
        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '5'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when a module hook's error counter grew by more than 1 within the
  # __SCRAPE_INTERVAL_X_4__ window, sustained for three minutes.
  - alert: D8DeckhouseCouldNotRunModuleHook
    for: 3m
    expr: 'max(increase(deckhouse_module_hook_errors_total[__SCRAPE_INTERVAL_X_4__])) by (pod, instance, module, hook) > 1'
    annotations:
      summary: Deckhouse is unable to run the {{ $labels.module }}/{{ $labels.hook }} module hook.
      description: |
        Please, refer to the corresponding logs: `kubectl -n d8-system logs -f -l app=deckhouse`.
      plk_markup_format: markdown
      plk_protocol_version: '1'
      plk_labels_as_annotations: 'instance,pod'
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '7'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when deckhouse_config_values_errors_total has been increasing over
  # the __SCRAPE_INTERVAL_X_2__ window for one minute.
  - alert: D8DeckhouseConfigInvalid
    for: 1m
    expr: 'increase(deckhouse_config_values_errors_total[__SCRAPE_INTERVAL_X_2__]) > 0'
    annotations:
      # The summary value ends with a newline, exactly as the block scalar it
      # replaces did.
      summary: "Deckhouse config is invalid.\n"
      description: |
        Deckhouse config contains errors.

        Please check Deckhouse logs by running `kubectl -n d8-system logs -f -l app=deckhouse`.

        Edit Deckhouse global configuration by running `kubectl edit mc global` or configuration of the specific module by running `kubectl edit mc <MODULE_NAME>`
      plk_protocol_version: '1'
      plk_markup_format: markdown
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '5'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires immediately (no `for`) while the maximum of d8_is_updating equals 1.
  - alert: DeckhouseUpdating
    expr: 'max (d8_is_updating) == 1'
    annotations:
      summary: Deckhouse is being updated.
      plk_markup_format: markdown
      plk_protocol_version: '1'
    labels:
      d8_component: deckhouse
      d8_module: deckhouse
      severity_level: '4'
      tier: cluster

  # Fires immediately (no `for`) while d8_updating_is_failed equals 1.
  # The aggregation keeps the `version` label because the description renders
  # it; a bare max() drops every label and leaves "Actual version" empty.
  - alert: DeckhouseUpdatingFailed
    expr: max by (version) (d8_updating_is_failed) == 1
    labels:
      severity_level: "4"
      tier: cluster
      d8_module: deckhouse
      d8_component: deckhouse
    annotations:
      plk_protocol_version: "1"
      plk_markup_format: "markdown"
      description: |
        Failed to update Deckhouse.

        Next version minor/patch Deckhouse image is not available in the registry or the image is corrupted.
        Actual version: {{ $labels.version }}.

        Make sure that the next version Deckhouse image is available in the registry.

      summary: Deckhouse updating is failed.

  # Fires when deckhouse_kubernetes_client_watch_errors_total has been
  # increasing over the __SCRAPE_INTERVAL_X_2__ window for one minute.
  - alert: D8DeckhouseWatchErrorOccurred
    for: 1m
    expr: 'increase(deckhouse_kubernetes_client_watch_errors_total[__SCRAPE_INTERVAL_X_2__]) > 0'
    annotations:
      # The summary value ends with a newline, exactly as the block scalar it
      # replaces did.
      summary: "Possible apiserver connection error in the client-go informer, check logs and snapshots.\n"
      description: |
        Error occurred in the client-go informer, possible problems with connection to apiserver.

        Check Deckhouse logs for more information by running:
        `kubectl -n d8-system logs deploy/deckhouse | grep error | grep -i watch`

        This alert is an attempt to detect the correlation between the faulty snapshot invalidation
        and apiserver connection errors, especially for the handle-node-template hook in the node-manager module.
        Check the difference between the snapshot and actual node objects for this hook:
        `diff -u <(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'|sort) <(kubectl -n d8-system exec deploy/deckhouse -- deckhouse-controller module snapshots node-manager -o json | jq '."040-node-manager/hooks/handle_node_templates.go"' | jq '.nodes.snapshot[] | .filterResult.Name' -r | sort)`
      plk_protocol_version: '1'
      plk_markup_format: markdown
      plk_create_group_if_not_exists__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
      plk_grouped_by__d8_deckhouse_malfunctioning: 'D8DeckhouseMalfunctioning,tier=cluster,prometheus=deckhouse,kubernetes=~kubernetes'
    labels:
      tier: cluster
      severity_level: '5'
      d8_component: deckhouse
      d8_module: deckhouse

  # Fires when d8_deprecated_configmap_managed_by_argocd has been above 0 for
  # ten minutes.
  # `for` is a rule-level field: placed under `annotations` it is only an
  # annotation string and the alert fires with no pending period.
  - alert: D8DeckhouseDeprecatedConfigmapManagedByArgoCD
    expr: |
      d8_deprecated_configmap_managed_by_argocd > 0
    for: 10m
    labels:
      tier: cluster
      severity_level: "4"
    annotations:
      plk_markup_format: markdown
      plk_protocol_version: "1"
      summary: Deprecated deckhouse configmap managed by Argo CD
      description: |
        The deckhouse configmap is no longer used.
        You need to remove configmap "d8-system/deckhouse" from ArgoCD
